Raw data

- GEO number: GSE128615

- Study summary: To determine the influence of differential Kdm6a expression in immune cells, whole transcriptome analysis for CD4+ T cells from WT and Kdm6a cKO mice were performed using RNA-Seq.

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(gridExtra)
library(pheatmap)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)

Setting AnnotationHub

DB <- "EnsDb"  # Assing your species
AnnotationSpecies <- "mus musculus"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))   # Connect to annotation DB

Running AnnotationHub

# Filter annotation of interest
ahQuery <- query(ah, 
                 pattern=c(DB, AnnotationSpecies), 
                 ignore.case=T)      


# Select the most recent data
DBName <- mcols(ahQuery) %>%
    rownames() %>%
    tail(1)

AnnoDb <- ah[[DBName]] 

# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")

# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="TXID",
                 columns=c("GENEID", "GENENAME")) 



# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
##                 TXID             GENEID GENENAME
## 1 ENSMUST00000000001 ENSMUSG00000000001    Gnai3
## 2 ENSMUST00000000003 ENSMUSG00000000003     Pbsn
## 3 ENSMUST00000000010 ENSMUSG00000020875    Hoxb9
## 4 ENSMUST00000000028 ENSMUSG00000000028    Cdc45
## 5 ENSMUST00000000033 ENSMUSG00000048583     Igf2
## 6 ENSMUST00000000049 ENSMUSG00000000049     Apoh

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
#
# Define sample names 
SampleNames <-  c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3)) 

# Define group level
GroupLevel <- c("WT", "cKO")

# Define contrast for DE analysis
Contrast <- c("Group", "cKO", "WT")


# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC

# Define .sf file path
sf <- c(paste0("salmon_output/", 
               SampleNames,
               ".salmon_quant/quant.sf"))

# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)

rownames(metadata) <- SampleNames

# Explore the metadata
print(metadata)
##            Sample Group                                         Path
## WT-rep1   WT-rep1    WT  salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2   WT-rep2    WT  salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3   WT-rep3    WT  salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1   cKO salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2   cKO salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3   cKO salmon_output/cKO-rep3.salmon_quant/quant.sf

Converting .sf files to txi list

- txi_tpm: stores TPM with the argument “countsFromAbundance=”lengthScaledTPM"

- txi_counts: stores original counts

- In this project, two txi objects were created with or without the countsFromAbundance=“lengthScaledTPM” argument and compared in downstream DE analysis.

- If you don’t want gene-level summarization, set txOut=TRUE.

- References: tximport doc, DESeq2 doc “Why unnormalized counts?”, Soneson et al. 2016, Developer Dr. Love’s comment, Harvard Chan Bioinformatics Core workshop

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames

# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 

txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

# tpm 
head(txi_tpm$counts)
##                       WT-rep1    WT-rep2    WT-rep3     cKO-rep1    cKO-rep2
## ENSMUSG00000000001 6577.34422 5799.47405 5642.76617 5995.7310672 6077.731462
## ENSMUSG00000000003    0.00000    0.00000    0.00000    0.0000000    0.000000
## ENSMUSG00000000028 3920.55098 3550.15001 3714.90523 3421.9627713 3407.169948
## ENSMUSG00000000031   25.55295   61.50787   45.36071    9.5017579   38.695366
## ENSMUSG00000000037   85.39075   75.78040  107.66889  138.5407492  170.159821
## ENSMUSG00000000049    0.00000    0.00000    0.00000    0.9981564    1.995204
##                      cKO-rep3
## ENSMUSG00000000001 5349.19912
## ENSMUSG00000000003    0.00000
## ENSMUSG00000000028 3121.53476
## ENSMUSG00000000031   95.49668
## ENSMUSG00000000037  137.19658
## ENSMUSG00000000049    0.00000
dim(txi_tpm$counts)
## [1] 54347     6
# counts
head(txi_counts$counts)
##                     WT-rep1  WT-rep2  WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492.000 5822.000 5614.000  6040.00  6142.00 5437.000
## ENSMUSG00000000003    0.000    0.000    0.000     0.00     0.00    0.000
## ENSMUSG00000000028 3904.066 3567.989 3714.192  3410.46  3463.04 3142.454
## ENSMUSG00000000031   33.000   45.000   58.000    12.00    28.00   70.000
## ENSMUSG00000000037  119.001   90.001   82.000   165.00   129.00  100.000
## ENSMUSG00000000049    0.000    0.000    0.000     1.00     2.00    0.000
dim(txi_counts$counts)
## [1] 54347     6

Creating DESeq objects from txi and VST

- Note: The tximport-to-DESeq2 approach uses estimated gene counts from the transcript abundance quantifiers, but not normalized counts.

- The DESeqDataSetFromTximport() function generated an DESeq object (aka dds) with the txi input.

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 

    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)

    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)

    # Output them as a list 
    return(list(dds=des, vsd=ves))

}

TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 54347 6 
## metadata(1): version
## assays(1): counts
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##                    WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001    6577    5799    5643     5996     6078     5349
## ENSMUSG00000000003       0       0       0        0        0        0
## ENSMUSG00000000028    3921    3550    3715     3422     3407     3122
## ENSMUSG00000000031      26      62      45       10       39       95
## ENSMUSG00000000037      85      76     108      139      170      137
## ENSMUSG00000000049       0       0       0        1        2        0
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 54347 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##                    WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001    6492    5822    5614     6040     6142     5437
## ENSMUSG00000000003       0       0       0        0        0        0
## ENSMUSG00000000028    3904    3568    3714     3410     3463     3142
## ENSMUSG00000000031      33      45      58       12       28       70
## ENSMUSG00000000037     119      90      82      165      129      100
## ENSMUSG00000000049       0       0       0        1        2        0

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 54347 6 
## metadata(1): version
## assays(1): ''
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 54347 6 
## metadata(1): version
## assays(1): ''
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- Messages when “Counts <- DESeqPrep_fn(Counts)” was run: using ‘avgTxLength’ from assays(dds)

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##   WT-rep1   WT-rep2   WT-rep3  cKO-rep1  cKO-rep2  cKO-rep3 
## 1.1105548 0.9995683 0.9771351 1.0201440 1.0317426 0.8932568
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##            Sample    Group                   Path
##          <factor> <factor>            <character>
## WT-rep1  WT-rep1       WT  salmon_output/WT-rep..
## WT-rep2  WT-rep2       WT  salmon_output/WT-rep..
## WT-rep3  WT-rep3       WT  salmon_output/WT-rep..
## cKO-rep1 cKO-rep1      cKO salmon_output/cKO-re..
## cKO-rep2 cKO-rep2      cKO salmon_output/cKO-re..
## cKO-rep3 cKO-rep3      cKO salmon_output/cKO-re..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##            Sample    Group                   Path sizeFactor
##          <factor> <factor>            <character>  <numeric>
## WT-rep1  WT-rep1       WT  salmon_output/WT-rep..   1.110555
## WT-rep2  WT-rep2       WT  salmon_output/WT-rep..   0.999568
## WT-rep3  WT-rep3       WT  salmon_output/WT-rep..   0.977135
## cKO-rep1 cKO-rep1      cKO salmon_output/cKO-re..   1.020144
## cKO-rep2 cKO-rep2      cKO salmon_output/cKO-re..   1.031743
## cKO-rep3 cKO-rep3      cKO salmon_output/cKO-re..   0.893257

Plotting the size factors in TPM

- The size factors are only available from TPM dds

- Blue dashed line: normalization factor = 1

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))

colnames(sizeFactor) <- 'Size_Factor'

sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 

sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)



# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")

Plotting nornalization factors in the Counts

- DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples. The counts for a gene in each sample is then divided by this mean. The median of these ratios in a sample is the size factor for that sample.

- Blue dashed line: normalization factor = 1

- Colored dots: normlization factors per gene (y-axis) in each sample

- Box plots: distribution of the normalization facters in each group (see the second plot)

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA, alpha=0.5) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.8, 1.2)

Setting functions for QC plots

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1] 

# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {

    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}

# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]

# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {

    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd

    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis (PCA)

- Checkpoints in PCA: source of variation, sample outlier

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

- Checkpoints of correlation heatmap: distance between samples, correlation in a group

- Upper: TPM input

- Lower: Counts input

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Running DE analysis with or without shrinkage

- Shrinkage reduces false positives

- Magnitude of shrinkage is affected by dispersion and sample size

- When the argument type is set to “apeglm”, the coef argument is used instead of constrast. In this dataset, you can set coef=Coef where Coef <- resultsNames(ddsList[[1]]).

- When the type is set to “normal”, the argument contrast is set as shown below.

- References: DESeq2 doc “Alternative shrinkage estimators”, Harvard Chan Bioinformatics Core workshop

# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]]) 
names(desList) <- TvC

# Create a list for TPM and Counts dds 
ddsList <- desList  # DE without shrinkage  
normal.ddsList <- desList    # DE with "normal" shrinkage
ape.ddsList <- desList       # DE with "apeglm" shrinkage
ash.ddsList <- desList       # DE with "ashr" shrinkage

for (x in TvC) {
    
    # Run DESeq() 
    ddsList[[x]] <- DESeq(desList[[x]])
    print(resultsNames(ddsList[[x]]))

    normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                                contrast=Contrast,
                                type="normal")

    ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="apeglm")

    ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="ashr")

}
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Plot dispersion  
for (x in TvC) {

    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Extracting DE results with or without shrinkage

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold 
alpha=0.1 

# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 

# Initialize lists of result tables 
all.resList <- all.ddsList 

# Set a function cleaning table
Sig.fn <- function(df, Input) {
    
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}




for (i in shrinkage) {

    if (i == "None") {

        for (x in TvC) {

        # Extract data frames from unshrunken lfc & clean data 
        all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]], 
                                                       contrast=Contrast, 
                                                       alpha=alpha)) %>% 
        Sig.fn(x)

         } 
    } else {

        # Extract data frames from shrunken lfc & clean data
        for (x in TvC) {

            all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
                Sig.fn(x)
    

        }
    }
}





# Explore the results 
summary(all.resList)
##        Length Class  Mode
## None   2      -none- list
## Normal 2      -none- list
## Apeglm 2      -none- list
## Ashr   2      -none- list
head(all.resList[[1]][['TPM']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5875.9369080     0.02099816 0.05786985  0.3628515
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3505.9693632    -0.10055360 0.07303168 -1.3768491
## 4 ENSMUSG00000000031   47.5744267     0.22519197 0.73238390  0.3074780
## 5 ENSMUSG00000000037  119.5824586     0.79057118 0.24424258  3.2368278
## 6 ENSMUSG00000000049    0.4864536     2.44892110 3.95081414  0.6198523
##        pvalue       padj   FDR Input
## 1 0.716715835 0.94295890 > 0.1   TPM
## 2          NA         NA > 0.1   TPM
## 3 0.168558919 0.59608179 > 0.1   TPM
## 4 0.758479529 0.95405920 > 0.1   TPM
## 5 0.001208663 0.03084828 < 0.1   TPM
## 6 0.535355054         NA > 0.1   TPM
head(all.resList[[1]][['Counts']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5914.7607747     0.02127119 0.05884416  0.3614834
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3528.9553327    -0.10034826 0.07366789 -1.3621709
## 4 ENSMUSG00000000031   45.9580425     0.22330899 0.74116522  0.3012945
## 5 ENSMUSG00000000037  116.2538103     0.80398885 0.24294058  3.3094053
## 6 ENSMUSG00000000049    0.4886878     2.45560171 3.86195822  0.6358437
##         pvalue      padj   FDR  Input
## 1 0.7177381103 0.9481132 > 0.1 Counts
## 2           NA        NA > 0.1 Counts
## 3 0.1731439441 0.6154148 > 0.1 Counts
## 4 0.7631899485 0.9585726 > 0.1 Counts
## 5 0.0009349437 0.0258125 < 0.1 Counts
## 6 0.5248783168        NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5875.9369080     0.02046530 0.05640137  0.3628515
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3505.9693632    -0.09654963 0.07012366 -1.3768491
## 4 ENSMUSG00000000031   47.5744267     0.04345855 0.14166307  0.3074780
## 5 ENSMUSG00000000037  119.5824586     0.53991562 0.16670298  3.2368278
## 6 ENSMUSG00000000049    0.4864536     0.02252178 0.03169874  0.6198523
##        pvalue       padj   FDR Input
## 1 0.716715835 0.94295890 > 0.1   TPM
## 2          NA         NA > 0.1   TPM
## 3 0.168558919 0.59608179 > 0.1   TPM
## 4 0.758479529 0.95405920 > 0.1   TPM
## 5 0.001208663 0.03084828 < 0.1   TPM
## 6 0.535355054         NA > 0.1   TPM
head(all.resList[[2]][['Counts']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5914.7607747     0.02071745 0.05731238  0.3614834
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3528.9553327    -0.09631344 0.07070590 -1.3621709
## 4 ENSMUSG00000000031   45.9580425     0.04230654 0.14143386  0.3012945
## 5 ENSMUSG00000000037  116.2538103     0.55120829 0.16681149  3.3094053
## 6 ENSMUSG00000000049    0.4886878     0.02375283 0.03262421  0.6358437
##         pvalue      padj   FDR  Input
## 1 0.7177381103 0.9481132 > 0.1 Counts
## 2           NA        NA > 0.1 Counts
## 3 0.1731439441 0.6154148 > 0.1 Counts
## 4 0.7631899485 0.9585726 > 0.1 Counts
## 5 0.0009349437 0.0258125 < 0.1 Counts
## 6 0.5248783168        NA > 0.1 Counts

Exploring mean-difference relationship with MA plots

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

- Reference: DESeq2 doc “MA-plot”

# Set ylim: has to adjusted by users depending on data 
yl <- c(-7.5, 7.5)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Initialize a list storing MA plots
MAList <- ddsList


# Create MA plots

for (i in shrinkage) {

    both.df <- rbind(all.resList[[i]][[TvC[1]]], 
                     all.resList[[i]][[TvC[2]]])

    MAList[[i]] <- ggplot(both.df, 
                              aes(x=baseMean, y=log2FoldChange, color=FDR))  +geom_point() + scale_x_log10() + facet_grid(~Input) + 
                                   theme_bw() + 
                                   scale_color_manual(values=c("blue", "grey")) +
                                   ggtitle(paste("MA plot with", i)) + 
                                   ylim(yl[1], yl[2]) + 
                                   geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red") 

}

   

# Print MA plots
MAList
## $TPM
## class: DESeqDataSet 
## dim: 54347 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
## 
## $Counts
## class: DESeqDataSet 
## dim: 54347 6 
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
## 
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

Exploring distribution of false discovery rate (FDR)

- Distribution of adjusted p-val (FDR) was presented

- x-axis: FDR

- y-axis: Number of genes

- Black dashed line: FDR = alpha

# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']], 
                 all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)

# Plot distribution of FDR
ggplot(both.df,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") + 
xlab("Adjusted P-Value (FDR)") + 
ylab("Count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Exploring distribution of log2FoldChange by input type

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

# Initialize a lfc list
lfcplotList <- all.resList 

# Create lfc distribution plots
for (i in shrinkage) {

    lfc.df <- rbind(all.resList[[i]][[TvC[1]]], 
                    all.resList[[i]][[TvC[2]]])

    lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]

    lfc.df$Input <- factor(lfc.df$Input, levels=TvC)

    lfcplotList[[i]] <- ggplot(lfc.df,  # Subset rows with FDR < alpha
                               aes(x=log2FoldChange, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() + ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black", 
           linetype="dashed", 
           size=1) + 
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) + 
xlim(-5, 5)


}

# Print the lfc plots
lfcplotList
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 


# Create a data frame storing NA gene number
NAstat <- both.df %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))

# Plot number of NA genes 
ggplot(NAstat, 
       aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Ranking DEGs with the TPM and original count inputs

- fdr.rank: ranked by FDR

- lfc.rank: ranked by absolute fold change

- up.lfc.rank: ranked by magnitude of fold change increase

- down.lfc.rank: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList

# Set a function cleaning a data frame 
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}

# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}


for (i in shrinkage) {

    for (x in TvC) {

        rankdf <- all.resList[[i]][[x]]

        fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn() 

        lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()

        up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% 
            arrange(desc(log2FoldChange)) %>% 
            Ranking.fn()

        down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
            arrange(log2FoldChange) %>%
            Ranking.fn()

    }
}

# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
##                  Gene   baseMean log2FoldChange     lfcSE      stat
## 1: ENSMUSG00000026463  1486.8997      -1.696086 0.1216619 -13.94098
## 2: ENSMUSG00000022018   637.3808      -1.277895 0.1155273 -11.06141
## 3: ENSMUSG00000032011 16059.7891      -1.909339 0.1840373 -10.37473
## 4: ENSMUSG00000040498   712.2468      -2.230573 0.2167892 -10.28913
## 5: ENSMUSG00000047180  1001.3846      -1.412164 0.1395955 -10.11611
## 6: ENSMUSG00000029468  1457.2985      -1.743485 0.1739533 -10.02272
##          pvalue         padj   FDR Input Rank
## 1: 3.570101e-44 4.829276e-40 < 0.1   TPM    1
## 2: 1.930438e-28 1.305652e-24 < 0.1   TPM    2
## 3: 3.231112e-25 1.456908e-21 < 0.1   TPM    3
## 4: 7.888262e-25 2.667613e-21 < 0.1   TPM    4
## 5: 4.686796e-24 1.267966e-20 < 0.1   TPM    5
## 6: 1.211213e-23 2.730679e-20 < 0.1   TPM    6
head(fdr.rank[[1]][[2]])
##                  Gene   baseMean log2FoldChange     lfcSE      stat
## 1: ENSMUSG00000026463  1495.3259      -1.697168 0.1219119 -13.92127
## 2: ENSMUSG00000022018   641.6187      -1.276915 0.1151204 -11.09200
## 3: ENSMUSG00000032011 16149.4069      -1.909044 0.1840118 -10.37457
## 4: ENSMUSG00000040498   716.4666      -2.229149 0.2156284 -10.33792
## 5: ENSMUSG00000047180  1007.8261      -1.412092 0.1391644 -10.14693
## 6: ENSMUSG00000029468  1456.8887      -1.743020 0.1738787 -10.02434
##          pvalue         padj   FDR  Input Rank
## 1: 4.704485e-44 6.637087e-40 < 0.1 Counts    1
## 2: 1.371801e-28 9.676681e-25 < 0.1 Counts    2
## 3: 3.236503e-25 1.522019e-21 < 0.1 Counts    3
## 4: 4.747503e-25 1.674444e-21 < 0.1 Counts    4
## 5: 3.419387e-24 9.648143e-21 < 0.1 Counts    5
## 6: 1.191522e-23 2.801665e-20 < 0.1 Counts    6
head(lfc.rank[[1]][[1]])
##                  Gene baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000102298 85.84981       9.904294 1.2072556  8.203975 2.325671e-16
## 2: ENSMUSG00000103587 27.65904       8.271554 1.2479720  6.627997 3.402732e-11
## 3: ENSMUSG00000081228 17.89201      -7.573484 1.2876161 -5.881787 4.058600e-09
## 4: ENSMUSG00000026616 42.71798      -5.132552 1.7434396 -2.943923 3.240807e-03
## 5: ENSMUSG00000052105 21.86338       4.433258 0.8809757  5.032213 4.848491e-07
## 6: ENSMUSG00000025929 39.39414      -3.504122 0.7724056 -4.536635 5.715904e-06
##            padj   FDR Input Rank
## 1: 2.621612e-13 < 0.1   TPM    1
## 2: 1.438399e-08 < 0.1   TPM    2
## 3: 1.098014e-06 < 0.1   TPM    3
## 4: 6.200622e-02 < 0.1   TPM    4
## 5: 6.760611e-05 < 0.1   TPM    5
## 6: 4.924779e-04 < 0.1   TPM    6
head(lfc.rank[[1]][[2]])
##                  Gene baseMean log2FoldChange    lfcSE      stat       pvalue
## 1: ENSMUSG00000102298 86.40658       9.912934 1.206247  8.217994 2.069355e-16
## 2: ENSMUSG00000103587 28.01962       8.289688 1.244466  6.661242 2.715225e-11
## 3: ENSMUSG00000081228 17.90003      -7.565160 1.283233 -5.895388 3.738009e-09
## 4: ENSMUSG00000091071 92.97825      -6.586493 2.094340 -3.144902 1.661426e-03
## 5: ENSMUSG00000026616 22.99255      -5.134312 1.433372 -3.581982 3.409973e-04
## 6: ENSMUSG00000052105 12.95269       4.486248 0.851751  5.267089 1.386040e-07
##            padj   FDR  Input Rank
## 1: 2.432872e-13 < 0.1 Counts    1
## 2: 1.197075e-08 < 0.1 Counts    2
## 3: 1.054717e-06 < 0.1 Counts    3
## 4: 3.952681e-02 < 0.1 Counts    4
## 5: 1.276072e-02 < 0.1 Counts    5
## 6: 2.300501e-05 < 0.1 Counts    6
head(up.lfc.rank[[1]][[1]])
##                  Gene baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSMUSG00000102298 85.84981       9.904294 1.2072556 8.203975 2.325671e-16
## 2: ENSMUSG00000103587 27.65904       8.271554 1.2479720 6.627997 3.402732e-11
## 3: ENSMUSG00000052105 21.86338       4.433258 0.8809757 5.032213 4.848491e-07
## 4: ENSMUSG00000027860 17.78000       2.864749 0.9491109 3.018351 2.541545e-03
## 5: ENSMUSG00000067599 51.07036       2.720996 0.5866686 4.638046 3.517192e-06
## 6: ENSMUSG00000049881 39.64867       2.712169 0.7863516 3.449054 5.625549e-04
##            padj   FDR Input Rank
## 1: 2.621612e-13 < 0.1   TPM    1
## 2: 1.438399e-08 < 0.1   TPM    2
## 3: 6.760611e-05 < 0.1   TPM    3
## 4: 5.232798e-02 < 0.1   TPM    4
## 5: 3.258702e-04 < 0.1   TPM    5
## 6: 1.752513e-02 < 0.1   TPM    6
head(up.lfc.rank[[1]][[2]])
##                  Gene baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSMUSG00000102298 86.40658       9.912934 1.2062475 8.217994 2.069355e-16
## 2: ENSMUSG00000103587 28.01962       8.289688 1.2444658 6.661242 2.715225e-11
## 3: ENSMUSG00000052105 12.95269       4.486248 0.8517510 5.267089 1.386040e-07
## 4: ENSMUSG00000085328 12.82875       3.508424 1.2470414 2.813398 4.902089e-03
## 5: ENSMUSG00000094825 12.92533       3.215367 0.9713344 3.310257 9.321022e-04
## 6: ENSMUSG00000043932 14.95823       3.169154 0.8873769 3.571373 3.551149e-04
##            padj   FDR  Input Rank
## 1: 2.432872e-13 < 0.1 Counts    1
## 2: 1.197075e-08 < 0.1 Counts    2
## 3: 2.300501e-05 < 0.1 Counts    3
## 4: 8.332370e-02 < 0.1 Counts    4
## 5: 2.578450e-02 < 0.1 Counts    5
## 6: 1.308084e-02 < 0.1 Counts    6
head(down.lfc.rank[[1]][[1]])
##                  Gene baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000081228 17.89201      -7.573484 1.2876161 -5.881787 4.058600e-09
## 2: ENSMUSG00000026616 42.71798      -5.132552 1.7434396 -2.943923 3.240807e-03
## 3: ENSMUSG00000025929 39.39414      -3.504122 0.7724056 -4.536635 5.715904e-06
## 4: ENSMUSG00000038305 27.67164      -3.315884 0.9145036 -3.625884 2.879744e-04
## 5: ENSMUSG00000037129 21.70495      -2.996111 0.6636694 -4.514463 6.347751e-06
## 6: ENSMUSG00000030769 61.98969      -2.895925 0.7804593 -3.710539 2.068186e-04
##            padj   FDR Input Rank
## 1: 1.098014e-06 < 0.1   TPM    1
## 2: 6.200622e-02 < 0.1   TPM    2
## 3: 4.924779e-04 < 0.1   TPM    3
## 4: 1.088098e-02 < 0.1   TPM    4
## 5: 5.400379e-04 < 0.1   TPM    5
## 6: 8.452072e-03 < 0.1   TPM    6
head(down.lfc.rank[[1]][[2]])
##                  Gene baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000081228 17.90003      -7.565160 1.2832335 -5.895388 3.738009e-09
## 2: ENSMUSG00000091071 92.97825      -6.586493 2.0943398 -3.144902 1.661426e-03
## 3: ENSMUSG00000026616 22.99255      -5.134312 1.4333718 -3.581982 3.409973e-04
## 4: ENSMUSG00000017002 13.01577      -3.816359 1.0324219 -3.696511 2.185830e-04
## 5: ENSMUSG00000096349 14.54338      -3.639189 1.2724638 -2.859955 4.237012e-03
## 6: ENSMUSG00000025929 39.64370      -3.514489 0.7624824 -4.609272 4.040815e-06
##            padj   FDR  Input Rank
## 1: 1.054717e-06 < 0.1 Counts    1
## 2: 3.952681e-02 < 0.1 Counts    2
## 3: 1.276072e-02 < 0.1 Counts    3
## 4: 9.096662e-03 < 0.1 Counts    4
## 5: 7.500097e-02 < 0.1 Counts    5
## 6: 3.878082e-04 < 0.1 Counts    6

Calculating rank difference

# Set a function rebuilding DE tables with gene ranks 
rankdiff.fn <- function(List){

    # Select columns and join the data frames by gene
    full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)], 
              List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)], 
              by="Gene") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 

# Set a function adding Shrinkage column 
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)} 

# Set a function rbinding multiple data frames 
rbinds.fn <- function(List) {rbind(List[[1]], 
                                   List[[2]], 
                                   List[[3]], 
                                   List[[4]])}



# Calculate and plot rank difference
for (i in shrinkage) {

    # Calculate rank difference
    fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
    lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
    up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
    down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i) 

}

# Create combined data frames across the shrinkages 
fdr.rank.df <- rbinds.fn(fdr.rank) 
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)



# Explore the ranking outputs
head(fdr.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000026463     TPM      1  1486.8997  Counts      1  1495.3259
## 2: ENSMUSG00000022018     TPM      2   637.3808  Counts      2   641.6187
## 3: ENSMUSG00000032011     TPM      3 16059.7891  Counts      3 16149.4069
## 4: ENSMUSG00000040498     TPM      4   712.2468  Counts      4   716.4666
## 5: ENSMUSG00000047180     TPM      5  1001.3846  Counts      5  1007.8261
## 6: ENSMUSG00000029468     TPM      6  1457.2985  Counts      6  1456.8887
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          7.711801        0        1      None
## 2:          6.865046        0        2      None
## 3:         10.091397        0        3      None
## 4:          6.975862        0        4      None
## 5:          7.316746        0        5      None
## 6:          7.689711        0        6      None
head(lfc.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298     TPM      1   85.84981  Counts      1   86.40658
## 2: ENSMUSG00000103587     TPM      2   27.65904  Counts      2   28.01962
## 3: ENSMUSG00000081228     TPM      3   17.89201  Counts      3   17.90003
## 4: ENSMUSG00000026616     TPM      4   42.71798  Counts      5   22.99255
## 5: ENSMUSG00000052105     TPM      5   21.86338  Counts      6   12.95269
## 6: ENSMUSG00000025929     TPM      6   39.39414  Counts      9   39.64370
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          4.860224        0      1.0      None
## 2:          3.729754        0      2.0      None
## 3:          3.289969        0      3.0      None
## 4:          3.992944       -1      4.5      None
## 5:          3.344265       -1      5.5      None
## 6:          4.081192       -3      7.5      None
head(up.lfc.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298     TPM      1   85.84981  Counts      1   86.40658
## 2: ENSMUSG00000103587     TPM      2   27.65904  Counts      2   28.01962
## 3: ENSMUSG00000052105     TPM      3   21.86338  Counts      3   12.95269
## 4: ENSMUSG00000027860     TPM      4   17.78000    <NA>     NA         NA
## 5: ENSMUSG00000067599     TPM      5   51.07036  Counts      8   51.54967
## 6: ENSMUSG00000049881     TPM      6   39.64867  Counts      9   31.82926
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          4.860224        0      1.0      None
## 2:          3.729754        0      2.0      None
## 3:          3.344265        0      3.0      None
## 4:                NA       NA       NA      None
## 5:          4.341793       -3      6.5      None
## 6:          4.017523       -3      7.5      None
head(down.lfc.rank.df)
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000081228     TPM      1   17.89201  Counts      1   17.90003
## 2: ENSMUSG00000026616     TPM      2   42.71798  Counts      3   22.99255
## 3: ENSMUSG00000025929     TPM      3   39.39414  Counts      6   39.64370
## 4: ENSMUSG00000038305     TPM      4   27.67164  Counts      7   22.49073
## 5: ENSMUSG00000037129     TPM      5   21.70495  Counts      8   21.76394
## 6: ENSMUSG00000030769     TPM      6   61.98969  Counts     10   51.13445
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          3.289969        0      1.0      None
## 2:          3.992944       -1      2.5      None
## 3:          4.081192       -3      4.5      None
## 4:          3.661431       -3      5.5      None
## 5:          3.483911       -3      6.5      None
## 6:          4.472289       -4      8.0      None

Visualizing DEG ranks I: TPM- vs Counts-input

- x-axis: rank with TPM input

- y-axis: rank with Counts input

- Black diagonal lines: rank with TPM = rank with Counts

- Dot color: gene expression level (log-baseMean)

# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, 
           aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red") 
}

# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")

ranking.plot.fn(lfc.rank.df, "Log2FoldChange")

ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")

ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")

Visualizing DEG ranks II: Rank difference

- x-axis: expression level (log-baseMean)

- y-axis: rank difference (rank with TPM - rank with Counts)

- Black horizontal lines: rank with TPM = rank with Counts

- Dot color: average rank

# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        facet_grid(~Shrinkage) + 
        theme(strip.text.x=element_text(size=10)) + 
        ylab("Rank Difference (TPM - Count)") + 
        ggtitle(paste("Rank Difference (TPM - Count):", rankedby)) + 
        geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") 
}

# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR")

rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange")

rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")

rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")

Distribution of rank difference in unshrunken data

- y-axis: abs(TPM-Count inputs)

- x-axis: FDR or log2FoldChange (Increase/Decrease)

# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}

# Create a list storing rankdiff data frames 
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
                 unshr.rankdiff.fn(lfc.rank.df),
                 unshr.rankdiff.fn(up.lfc.rank.df),
                 unshr.rankdiff.fn(down.lfc.rank.df))


# Assine result column as a factor with levels 
rankdiff.levels <- c("FDR", 
                     "log2FoldChange", 
                     "log2FoldChange.Increase", 
                     "log2FoldChange.Decrease")


# Name the list 
names(rankList) <- rankdiff.levels

# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff), 
                            log2FoldChange=abs(rankList[[2]]$RankDiff),
                            log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
                            log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff) 

rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)

# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
       aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) + 
geom_boxplot(alpha=0.5, fill="grey", color="black") + 
theme_bw() + 
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n(TPM - Count Input)") + 
ylab("Absolute Rank Difference (TPM - Count Input)") 

Relationship between rank difference and number of transcript versions

- y-axis: abs(TPM-Count inputs)

- x-axis: number of transcripts (number of transcript id / gene id)

- dot color: mean rank

- 16 genes were missing in the plots

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(GENEID) %>% 
    summarize(num.trans=n_distinct(TXID))

# Set a function adding the number of transcripts by gene id 
add.ntrans.fn <- function(df) {

    left_join(df, AnnoDb.ntrans, by=c("Gene"="GENEID"))}




# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {

    rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}


# Explore the edited data frames
summary(rankList)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
head(rankList[[1]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000026463     TPM      1  1486.8997  Counts      1  1495.3259
## 2: ENSMUSG00000022018     TPM      2   637.3808  Counts      2   641.6187
## 3: ENSMUSG00000032011     TPM      3 16059.7891  Counts      3 16149.4069
## 4: ENSMUSG00000040498     TPM      4   712.2468  Counts      4   716.4666
## 5: ENSMUSG00000047180     TPM      5  1001.3846  Counts      5  1007.8261
## 6: ENSMUSG00000029468     TPM      6  1457.2985  Counts      6  1456.8887
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          7.711801        0        1      None         8
## 2:          6.865046        0        2      None         1
## 3:         10.091397        0        3      None         6
## 4:          6.975862        0        4      None         4
## 5:          7.316746        0        5      None         2
## 6:          7.689711        0        6      None         8
head(rankList[[2]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298     TPM      1   85.84981  Counts      1   86.40658
## 2: ENSMUSG00000103587     TPM      2   27.65904  Counts      2   28.01962
## 3: ENSMUSG00000081228     TPM      3   17.89201  Counts      3   17.90003
## 4: ENSMUSG00000026616     TPM      4   42.71798  Counts      5   22.99255
## 5: ENSMUSG00000052105     TPM      5   21.86338  Counts      6   12.95269
## 6: ENSMUSG00000025929     TPM      6   39.39414  Counts      9   39.64370
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          4.860224        0      1.0      None         1
## 2:          3.729754        0      2.0      None         1
## 3:          3.289969        0      3.0      None         1
## 4:          3.992944       -1      4.5      None        13
## 5:          3.344265       -1      5.5      None        10
## 6:          4.081192       -3      7.5      None         1
head(rankList[[3]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298     TPM      1   85.84981  Counts      1   86.40658
## 2: ENSMUSG00000103587     TPM      2   27.65904  Counts      2   28.01962
## 3: ENSMUSG00000052105     TPM      3   21.86338  Counts      3   12.95269
## 4: ENSMUSG00000027860     TPM      4   17.78000    <NA>     NA         NA
## 5: ENSMUSG00000067599     TPM      5   51.07036  Counts      8   51.54967
## 6: ENSMUSG00000049881     TPM      6   39.64867  Counts      9   31.82926
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          4.860224        0      1.0      None         1
## 2:          3.729754        0      2.0      None         1
## 3:          3.344265        0      3.0      None        10
## 4:                NA       NA       NA      None         8
## 5:          4.341793       -3      6.5      None         4
## 6:          4.017523       -3      7.5      None         4
head(rankList[[4]])
##                  Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000081228     TPM      1   17.89201  Counts      1   17.90003
## 2: ENSMUSG00000026616     TPM      2   42.71798  Counts      3   22.99255
## 3: ENSMUSG00000025929     TPM      3   39.39414  Counts      6   39.64370
## 4: ENSMUSG00000038305     TPM      4   27.67164  Counts      7   22.49073
## 5: ENSMUSG00000037129     TPM      5   21.70495  Counts      8   21.76394
## 6: ENSMUSG00000030769     TPM      6   61.98969  Counts     10   51.13445
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          3.289969        0      1.0      None         1
## 2:          3.992944       -1      2.5      None        13
## 3:          4.081192       -3      4.5      None         1
## 4:          3.661431       -3      5.5      None        15
## 5:          3.483911       -3      6.5      None         1
## 6:          4.472289       -4      8.0      None         8
# Set a function plotting rank difference vs number of transcripts 
rank.ntrans.plot.fn <- function(df, title) {

    ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) + 
        geom_jitter(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \nin", title)) + 
        xlab("Number of Alternative Transcripts") + 
        ylab("Absolute Rank Difference (TPM - Counts)") + scale_color_gradient(low="blue", high="red") 
}

# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")

rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")

rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")

rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")

Finding genes having large difference in rankings

# Initialize a list storing rankdiff genes 
large.rankdiff <- rankList

# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)

names(rankdiff.thr) <- rankdiff.levels

for (x in rankdiff.levels) {

    # Filter out observations below the rankdiff thresholds
    large.rankdiff[[x]] <- subset(rankList[[x]], 
                                      abs(RankDiff) > rankdiff.thr[x]) 

}

# Explore the filtered genes 
summary(large.rankdiff)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 364  12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 713  12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1]  5 12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 13 12
# Set a function saving rankdiff.csv files
saving.fn <- function(input.df, data.type) { 

    filename <- paste0("tpmVScount_", data.type, "_RankDiff.csv")

    return(write.csv(input.df, filename))
}



# Save the filtered and arranged data frames as csv files
saving.fn(large.rankdiff[[rankdiff.levels[1]]], "FDR")
saving.fn(large.rankdiff[[rankdiff.levels[2]]], "LFC")
saving.fn(large.rankdiff[[rankdiff.levels[3]]], "LFC_Increase")
saving.fn(large.rankdiff[[rankdiff.levels[4]]], "LFC_Decrease")

Summarizing up/down DEGs with an upset plot

- red bar: input type

- blue bar: directionality of gene expression change

# Set a function cleaning data to generate upset plots 
upset.input.fn <- function(df) {

    df <- df %>% 

        # Filter genes with valid padj 
        subset(!is.na(padj)) %>% 
        
        mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
               
               Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
               
               Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
               
               TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
               
               Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input? 

    # Create a list storing groups of interest
    upsetInput <- list(Up=df$Up, 
                       Down=df$Down, 
                       Unchanged=df$Unchanged, 
                       TPM_Input=df$TPM, 
                       Counts_Input=df$Counts) 

    return(upsetInput)

}

upsetList <- upset.input.fn(both.df)


# Create the upset plot 
upset(fromList(upsetList), 
      sets=names(upsetList),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red","blue", "blue", "blue"),
      text.scale = 1.5, number.angles=30) 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ashr_2.2-47                 apeglm_1.12.0              
##  [3] ensembldb_2.14.0            AnnotationFilter_1.14.0    
##  [5] GenomicFeatures_1.42.1      AnnotationDbi_1.52.0       
##  [7] UpSetR_1.4.0                pheatmap_1.0.12            
##  [9] gridExtra_2.3               DESeq2_1.30.0              
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [13] MatrixGenerics_1.2.0        matrixStats_0.57.0         
## [15] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [17] IRanges_2.24.0              S4Vectors_0.28.1           
## [19] tximport_1.18.0             forcats_0.5.0              
## [21] stringr_1.4.0               dplyr_1.0.2                
## [23] purrr_0.3.4                 readr_1.4.0                
## [25] tidyr_1.1.2                 tibble_3.0.4               
## [27] ggplot2_3.3.2               tidyverse_1.3.0            
## [29] AnnotationHub_2.22.0        BiocFileCache_1.14.0       
## [31] dbplyr_2.0.0                BiocGenerics_0.36.0        
## [33] rmarkdown_2.5               data.table_1.13.4          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                  backports_1.2.1              
##   [3] plyr_1.8.6                    lazyeval_0.2.2               
##   [5] splines_4.0.3                 BiocParallel_1.24.1          
##   [7] digest_0.6.27                 invgamma_1.1                 
##   [9] htmltools_0.5.0               SQUAREM_2020.5               
##  [11] fansi_0.4.1                   magrittr_2.0.1               
##  [13] memoise_1.1.0                 Biostrings_2.58.0            
##  [15] annotate_1.68.0               modelr_0.1.8                 
##  [17] askpass_1.1                   bdsmatrix_1.3-4              
##  [19] prettyunits_1.1.1             colorspace_2.0-0             
##  [21] blob_1.2.1                    rvest_0.3.6                  
##  [23] rappdirs_0.3.1                haven_2.3.1                  
##  [25] xfun_0.19                     crayon_1.3.4                 
##  [27] RCurl_1.98-1.2                jsonlite_1.7.2               
##  [29] genefilter_1.72.0             survival_3.2-7               
##  [31] glue_1.4.2                    gtable_0.3.0                 
##  [33] zlibbioc_1.36.0               XVector_0.30.0               
##  [35] DelayedArray_0.16.0           scales_1.1.1                 
##  [37] mvtnorm_1.1-1                 DBI_1.1.0                    
##  [39] Rcpp_1.0.5                    xtable_1.8-4                 
##  [41] progress_1.2.2                emdbook_1.3.12               
##  [43] bit_4.0.4                     truncnorm_1.0-8              
##  [45] httr_1.4.2                    RColorBrewer_1.1-2           
##  [47] ellipsis_0.3.1                farver_2.0.3                 
##  [49] pkgconfig_2.0.3               XML_3.99-0.5                 
##  [51] locfit_1.5-9.4                labeling_0.4.2               
##  [53] tidyselect_1.1.0              rlang_0.4.9                  
##  [55] later_1.1.0.1                 munsell_0.5.0                
##  [57] BiocVersion_3.12.0            cellranger_1.1.0             
##  [59] tools_4.0.3                   cli_2.2.0                    
##  [61] generics_0.1.0                RSQLite_2.2.1                
##  [63] broom_0.7.2                   evaluate_0.14                
##  [65] fastmap_1.0.1                 yaml_2.2.1                   
##  [67] knitr_1.30                    bit64_4.0.5                  
##  [69] fs_1.5.0                      mime_0.9                     
##  [71] xml2_1.3.2                    biomaRt_2.46.0               
##  [73] compiler_4.0.3                rstudioapi_0.13              
##  [75] curl_4.3                      interactiveDisplayBase_1.28.0
##  [77] reprex_0.3.0                  geneplotter_1.68.0           
##  [79] stringi_1.5.3                 ps_1.5.0                     
##  [81] lattice_0.20-41               ProtGenerics_1.22.0          
##  [83] Matrix_1.2-18                 vctrs_0.3.5                  
##  [85] pillar_1.4.7                  lifecycle_0.2.0              
##  [87] BiocManager_1.30.10           bitops_1.0-6                 
##  [89] irlba_2.3.3                   httpuv_1.5.4                 
##  [91] rtracklayer_1.50.0            R6_2.5.0                     
##  [93] promises_1.1.1                MASS_7.3-53                  
##  [95] assertthat_0.2.1              openssl_1.4.3                
##  [97] withr_2.3.0                   GenomicAlignments_1.26.0     
##  [99] Rsamtools_2.6.0               GenomeInfoDbData_1.2.4       
## [101] hms_0.5.3                     grid_4.0.3                   
## [103] coda_0.19-4                   mixsqp_0.3-43                
## [105] bbmle_1.0.23.1                numDeriv_2016.8-1.1          
## [107] shiny_1.5.0                   lubridate_1.7.9.2